In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier ,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC,LinearSVC
In [2]:
a=pd.read_csv(r'C:\Users\Sridevi\Downloads\city_day1.csv')
print(a)
                City       Date  PM2.5   PM10     NO    NO2    NOx    NH3  \
0          Ahmedabad   1/1/2015    NaN    NaN   0.92  18.22  17.15    NaN   
1          Ahmedabad   1/2/2015    NaN    NaN   0.97  15.69  16.46    NaN   
2          Ahmedabad   1/3/2015    NaN    NaN  17.40  19.30  29.70    NaN   
3          Ahmedabad   1/4/2015    NaN    NaN   1.70  18.48  17.97    NaN   
4          Ahmedabad   1/5/2015    NaN    NaN  22.10  21.42  37.76    NaN   
...              ...        ...    ...    ...    ...    ...    ...    ...   
14870  Visakhapatnam  6/27/2020  15.02  50.94   7.68  25.06  19.54  12.47   
14871  Visakhapatnam  6/28/2020  24.38  74.09   3.42  26.06  16.53  11.99   
14872  Visakhapatnam  6/29/2020  22.91  65.73   3.45  29.53  18.33  10.71   
14873  Visakhapatnam  6/30/2020  16.64  49.97   4.05  29.26  18.80  10.03   
14874  Visakhapatnam   7/1/2020  15.00  66.00   0.40  26.85  14.05   5.20   

          CO    SO2      O3  Benzene  Toluene  Xylene   AQI    AQI_Bucket  
0       0.92  27.64  133.36     0.00     0.02    0.00   NaN           NaN  
1       0.97  24.55   34.06     3.68     5.50    3.77   NaN           NaN  
2      17.40  29.07   30.70     6.80    16.40    2.25   NaN           NaN  
3       1.70  18.59   36.08     4.43    10.14    1.00   NaN           NaN  
4      22.10  39.33   39.31     7.01    18.89    2.78   NaN           NaN  
...      ...    ...     ...      ...      ...     ...   ...           ...  
14870   0.47   8.55   23.30     2.24    12.07    0.73  41.0          Good  
14871   0.52  12.72   30.14     0.74     2.21    0.38  70.0  Satisfactory  
14872   0.48   8.42   30.96     0.01     0.01    0.00  68.0  Satisfactory  
14873   0.52   9.84   28.30     0.00     0.00    0.00  54.0  Satisfactory  
14874   0.59   2.10   17.05      NaN      NaN     NaN  50.0          Good  

[14875 rows x 16 columns]
In [3]:
a.isnull().sum()
Out[3]:
City             0
Date             0
PM2.5         2526
PM10          5645
NO            2232
NO2           2223
NOx           1183
NH3           5009
CO             717
SO2           2330
O3            2482
Benzene       1575
Toluene       1991
Xylene        7410
AQI           2666
AQI_Bucket    2666
dtype: int64
In [4]:
a.shape
Out[4]:
(14875, 16)
In [5]:
a.size
Out[5]:
238000
In [6]:
del a['Xylene']
del a['PM10']
del a['NH3']
a
Out[6]:
City Date PM2.5 NO NO2 NOx CO SO2 O3 Benzene Toluene AQI AQI_Bucket
0 Ahmedabad 1/1/2015 NaN 0.92 18.22 17.15 0.92 27.64 133.36 0.00 0.02 NaN NaN
1 Ahmedabad 1/2/2015 NaN 0.97 15.69 16.46 0.97 24.55 34.06 3.68 5.50 NaN NaN
2 Ahmedabad 1/3/2015 NaN 17.40 19.30 29.70 17.40 29.07 30.70 6.80 16.40 NaN NaN
3 Ahmedabad 1/4/2015 NaN 1.70 18.48 17.97 1.70 18.59 36.08 4.43 10.14 NaN NaN
4 Ahmedabad 1/5/2015 NaN 22.10 21.42 37.76 22.10 39.33 39.31 7.01 18.89 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
14870 Visakhapatnam 6/27/2020 15.02 7.68 25.06 19.54 0.47 8.55 23.30 2.24 12.07 41.0 Good
14871 Visakhapatnam 6/28/2020 24.38 3.42 26.06 16.53 0.52 12.72 30.14 0.74 2.21 70.0 Satisfactory
14872 Visakhapatnam 6/29/2020 22.91 3.45 29.53 18.33 0.48 8.42 30.96 0.01 0.01 68.0 Satisfactory
14873 Visakhapatnam 6/30/2020 16.64 4.05 29.26 18.80 0.52 9.84 28.30 0.00 0.00 54.0 Satisfactory
14874 Visakhapatnam 7/1/2020 15.00 0.40 26.85 14.05 0.59 2.10 17.05 NaN NaN 50.0 Good

14875 rows × 13 columns

In [7]:
a.columns
Out[7]:
Index(['City', 'Date', 'PM2.5', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3',
       'Benzene', 'Toluene', 'AQI', 'AQI_Bucket'],
      dtype='object')
In [8]:
a.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14875 entries, 0 to 14874
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        14875 non-null  object 
 1   Date        14875 non-null  object 
 2   PM2.5       12349 non-null  float64
 3   NO          12643 non-null  float64
 4   NO2         12652 non-null  float64
 5   NOx         13692 non-null  float64
 6   CO          14158 non-null  float64
 7   SO2         12545 non-null  float64
 8   O3          12393 non-null  float64
 9   Benzene     13300 non-null  float64
 10  Toluene     12884 non-null  float64
 11  AQI         12209 non-null  float64
 12  AQI_Bucket  12209 non-null  object 
dtypes: float64(10), object(3)
memory usage: 1.5+ MB
In [9]:
a.describe()
Out[9]:
PM2.5 NO NO2 NOx CO SO2 O3 Benzene Toluene AQI
count 12349.000000 12643.000000 12652.000000 13692.000000 14158.000000 12545.000000 12393.000000 13300.000000 12884.000000 12209.000000
mean 59.148547 18.721084 34.743467 35.887924 3.207676 15.199771 36.293217 3.196163 10.071038 171.151118
std 52.944778 23.601786 26.419470 33.602578 9.471177 20.573747 21.384305 8.937824 16.859380 167.429820
min 1.720000 0.060000 0.010000 0.000000 0.000000 0.480000 0.020000 0.000000 0.000000 20.000000
25% 27.740000 6.040000 16.690000 15.737500 0.480000 5.580000 21.390000 0.170000 0.500000 80.000000
50% 44.730000 10.130000 28.295000 25.460000 0.870000 8.880000 33.120000 1.340000 4.080000 113.000000
75% 69.990000 20.860000 44.570000 44.455000 1.400000 15.730000 46.880000 3.820000 12.170000 195.000000
max 685.360000 270.090000 292.020000 467.630000 175.810000 193.860000 257.730000 391.880000 411.520000 2049.000000
In [10]:
a.nunique()
Out[10]:
City            10
Date          2009
PM2.5         7393
NO            4357
NO2           6012
NOx           6307
CO            1622
SO2           3525
O3            5850
Benzene       1547
Toluene       3167
AQI            799
AQI_Bucket       6
dtype: int64
In [11]:
a.mean()
C:\Users\Sridevi\AppData\Local\Temp\ipykernel_12428\1798845826.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  a.mean()
Out[11]:
PM2.5       59.148547
NO          18.721084
NO2         34.743467
NOx         35.887924
CO           3.207676
SO2         15.199771
O3          36.293217
Benzene      3.196163
Toluene     10.071038
AQI        171.151118
dtype: float64
In [12]:
a.fillna(a.mean())
C:\Users\Sridevi\AppData\Local\Temp\ipykernel_12428\3867590745.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  a.fillna(a.mean())
Out[12]:
City Date PM2.5 NO NO2 NOx CO SO2 O3 Benzene Toluene AQI AQI_Bucket
0 Ahmedabad 1/1/2015 59.148547 0.92 18.22 17.15 0.92 27.64 133.36 0.000000 0.020000 171.151118 NaN
1 Ahmedabad 1/2/2015 59.148547 0.97 15.69 16.46 0.97 24.55 34.06 3.680000 5.500000 171.151118 NaN
2 Ahmedabad 1/3/2015 59.148547 17.40 19.30 29.70 17.40 29.07 30.70 6.800000 16.400000 171.151118 NaN
3 Ahmedabad 1/4/2015 59.148547 1.70 18.48 17.97 1.70 18.59 36.08 4.430000 10.140000 171.151118 NaN
4 Ahmedabad 1/5/2015 59.148547 22.10 21.42 37.76 22.10 39.33 39.31 7.010000 18.890000 171.151118 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
14870 Visakhapatnam 6/27/2020 15.020000 7.68 25.06 19.54 0.47 8.55 23.30 2.240000 12.070000 41.000000 Good
14871 Visakhapatnam 6/28/2020 24.380000 3.42 26.06 16.53 0.52 12.72 30.14 0.740000 2.210000 70.000000 Satisfactory
14872 Visakhapatnam 6/29/2020 22.910000 3.45 29.53 18.33 0.48 8.42 30.96 0.010000 0.010000 68.000000 Satisfactory
14873 Visakhapatnam 6/30/2020 16.640000 4.05 29.26 18.80 0.52 9.84 28.30 0.000000 0.000000 54.000000 Satisfactory
14874 Visakhapatnam 7/1/2020 15.000000 0.40 26.85 14.05 0.59 2.10 17.05 3.196163 10.071038 50.000000 Good

14875 rows × 13 columns

In [13]:
a['AQI_Bucket'].replace(np.nan,value='Moderate')
Out[13]:
0            Moderate
1            Moderate
2            Moderate
3            Moderate
4            Moderate
             ...     
14870            Good
14871    Satisfactory
14872    Satisfactory
14873    Satisfactory
14874            Good
Name: AQI_Bucket, Length: 14875, dtype: object
In [14]:
sns.regplot(x=a['CO'],y=a['AQI'])
Out[14]:
<AxesSubplot:xlabel='CO', ylabel='AQI'>
In [15]:
sns.regplot(x=a['CO'],y=a['AQI'])
sns.regplot(x=a['Toluene'],y=a['AQI'])
Out[15]:
<AxesSubplot:xlabel='Toluene', ylabel='AQI'>
In [20]:
g=sns.FacetGrid(a,col="AQI_Bucket",margin_titles=True)
g.map(sns.regplot,"AQI","CO",color=".3",fit_reg=False,x_jitter=.1)
Out[20]:
<seaborn.axisgrid.FacetGrid at 0x2b915119dc0>
In [21]:
sns.heatmap(a.isnull(),yticklabels=False, cbar=False, cmap="Accent_r")
Out[21]:
<AxesSubplot:>
In [22]:
plt.figure(figsize=(12,10))
sns.heatmap(a.corr(),annot=True, cmap='Blues')
Out[22]:
<AxesSubplot:>
In [23]:
plt.figure(figsize=(20,10))
sns.scatterplot(x='City',y='AQI',data=a)
plt.show()
In [24]:
sns.barplot(x='AQI_Bucket',y='AQI',data=a)
plt.show()
In [28]:
print(a.City.unique())
['Ahmedabad' 'Bengaluru' 'Chennai' 'Coimbatore' 'Delhi' 'Hyderabad'
 'Kochi' 'Kolkata' 'Mumbai' 'Visakhapatnam']
In [37]:
plt.figure(figsize=(20,10))
g=sns.FacetGrid(a,col="AQI_Bucket",height=4,aspect=.5)
g.map(sns.barplot,"City","AQI",color=".3",order=["Chennai","Coimbatore"])
Out[37]:
<seaborn.axisgrid.FacetGrid at 0x2b91870cf70>
<Figure size 1440x720 with 0 Axes>
In [38]:
sns.histplot(x="AQI", data=a,color="Purple" )
plt.show()
In [39]:
sns.pairplot(a,hue="AQI")
Out[39]:
<seaborn.axisgrid.PairGrid at 0x2b919b867f0>
In [40]:
sns.pairplot(a,hue="AQI_Bucket")
Out[40]:
<seaborn.axisgrid.PairGrid at 0x2b928de1a90>
In [41]:
g=sns.PairGrid(a,hue="AQI_Bucket")
g.map(sns.scatterplot)
g.add_legend()
Out[41]:
<seaborn.axisgrid.PairGrid at 0x2b9352cfd00>
In [78]:
sns.boxplot(data=a,x="AQI",y="AQI_Bucket")
Out[78]:
<AxesSubplot:xlabel='AQI', ylabel='AQI_Bucket'>
In [79]:
a.corr()
Out[79]:
PM2.5 NO NO2 NOx CO SO2 O3 Benzene Toluene AQI
PM2.5 1.000000 0.582403 0.481107 0.515665 0.136044 0.204288 0.363608 0.202893 0.294136 0.558344
NO 0.582403 1.000000 0.526127 0.761933 0.274019 0.254245 0.145343 0.184412 0.319343 0.494988
NO2 0.481107 0.526127 1.000000 0.671461 0.420938 0.589141 0.347931 0.163130 0.469430 0.632438
NOx 0.515665 0.761933 0.671461 1.000000 0.264610 0.337986 0.259243 0.198954 0.379522 0.512750
CO 0.136044 0.274019 0.420938 0.264610 1.000000 0.594205 0.052150 0.164089 0.412458 0.794315
SO2 0.204288 0.254245 0.589141 0.337986 0.594205 1.000000 0.184961 0.146508 0.467860 0.639325
O3 0.363608 0.145343 0.347931 0.259243 0.052150 0.184961 1.000000 0.034971 0.133195 0.277670
Benzene 0.202893 0.184412 0.163130 0.198954 0.164089 0.146508 0.034971 1.000000 0.376649 0.236064
Toluene 0.294136 0.319343 0.469430 0.379522 0.412458 0.467860 0.133195 0.376649 1.000000 0.486838
AQI 0.558344 0.494988 0.632438 0.512750 0.794315 0.639325 0.277670 0.236064 0.486838 1.000000